1   package org.apache.lucene.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  /*
21   * Some of this code came from the excellent Unicode
22   * conversion examples from:
23   *
24   *   http://www.unicode.org/Public/PROGRAMS/CVTUTF
25   *
26   * Full Copyright for that code follows:
27  */
28  
29  /*
30   * Copyright 2001-2004 Unicode, Inc.
31   * 
32   * Disclaimer
33   * 
34   * This source code is provided as is by Unicode, Inc. No claims are
35   * made as to fitness for any particular purpose. No warranties of any
36   * kind are expressed or implied. The recipient agrees to determine
37   * applicability of information provided. If this file has been
38   * purchased on magnetic or optical media from Unicode, Inc., the
39   * sole remedy for any claim will be exchange of defective media
40   * within 90 days of receipt.
41   * 
42   * Limitations on Rights to Redistribute This Code
43   * 
44   * Unicode, Inc. hereby grants the right to freely use the information
45   * supplied in this file in the creation of products supporting the
46   * Unicode Standard, and to make copies of this file in any form
47   * for internal or external distribution as long as this notice
48   * remains attached.
49   */
50  
51  /*
52   * Additional code came from the IBM ICU library.
53   *
54   *  http://www.icu-project.org
55   *
56   * Full Copyright for that code follows.
57   */
58  
59  /*
60   * Copyright (C) 1999-2010, International Business Machines
61   * Corporation and others.  All Rights Reserved.
62   *
63   * Permission is hereby granted, free of charge, to any person obtaining a copy
64   * of this software and associated documentation files (the "Software"), to deal
65   * in the Software without restriction, including without limitation the rights
66   * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
67   * Software, and to permit persons to whom the Software is furnished to do so,
68   * provided that the above copyright notice(s) and this permission notice appear
69   * in all copies of the Software and that both the above copyright notice(s) and
70   * this permission notice appear in supporting documentation.
71   *
72   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
75   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
76   * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
77   * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
78   * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
79   * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
80   *
81   * Except as contained in this notice, the name of a copyright holder shall not
82   * be used in advertising or otherwise to promote the sale, use or other
83   * dealings in this Software without prior written authorization of the
84   * copyright holder.
85   */
86  
87  public class TestUnicodeUtil extends LuceneTestCase {
88    public void testCodePointCount() {
89      // Check invalid codepoints.
90      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0x80, 'z', 'z', 'z'));
91      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xc0 - 1, 'z', 'z', 'z'));
92      // Check 5-byte and longer sequences.
93      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf8, 'z', 'z', 'z'));
94      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xfc, 'z', 'z', 'z'));
95      // Check improperly terminated codepoints.
96      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xc2));
97      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xe2));
98      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xe2, 0x82));
99      assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0));
100     assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0, 0xa4));
101     assertcodePointCountThrowsAssertionOn(asByteArray('z', 0xf0, 0xa4, 0xad));
102 
103     // Check some typical examples (multibyte).
104     assertEquals(0, UnicodeUtil.codePointCount(new BytesRef(asByteArray())));
105     assertEquals(3, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 'z', 'z'))));
106     assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xc2, 0xa2))));
107     assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xe2, 0x82, 0xac))));
108     assertEquals(2, UnicodeUtil.codePointCount(new BytesRef(asByteArray('z', 0xf0, 0xa4, 0xad, 0xa2))));
109 
110     // And do some random stuff.
111     int num = atLeast(50000);
112     for (int i = 0; i < num; i++) {
113       final String s = TestUtil.randomUnicodeString(random());
114       final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
115       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
116       assertEquals(s.codePointCount(0, s.length()),
117                    UnicodeUtil.codePointCount(new BytesRef(utf8, 0, utf8Len)));
118     }
119   }
120 
121   private byte[] asByteArray(int... ints) {
122     byte [] asByteArray = new byte [ints.length];
123     for (int i = 0; i < ints.length; i++) {
124       asByteArray[i] = (byte) ints[i];
125     }
126     return asByteArray;
127   }
128 
129   private void assertcodePointCountThrowsAssertionOn(byte... bytes) {
130     boolean threwAssertion = false;
131     try {
132       UnicodeUtil.codePointCount(new BytesRef(bytes));
133     } catch (IllegalArgumentException e) {
134       threwAssertion = true;
135     }
136     assertTrue(threwAssertion);
137   }
138 
139   public void testUTF8toUTF32() {
140     int[] utf32 = new int[0];
141     int[] codePoints = new int[20];
142     int num = atLeast(50000);
143     for (int i = 0; i < num; i++) {
144       final String s = TestUtil.randomUnicodeString(random());
145       final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
146       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
147       utf32 = ArrayUtil.grow(utf32, utf8Len);
148       final int utf32Len = UnicodeUtil.UTF8toUTF32(new BytesRef(utf8, 0, utf8Len), utf32);
149       
150       int charUpto = 0;
151       int intUpto = 0;
152       while(charUpto < s.length()) {
153         final int cp = s.codePointAt(charUpto);
154         codePoints[intUpto++] = cp;
155         charUpto += Character.charCount(cp);
156       }
157       if (!ArrayUtil.equals(codePoints, 0, utf32, 0, intUpto)) {
158         System.out.println("FAILED");
159         for(int j=0;j<s.length();j++) {
160           System.out.println("  char[" + j + "]=" + Integer.toHexString(s.charAt(j)));
161         }
162         System.out.println();
163         assertEquals(intUpto, utf32Len);
164         for(int j=0;j<intUpto;j++) {
165           System.out.println("  " + Integer.toHexString(utf32[j]) + " vs " + Integer.toHexString(codePoints[j]));
166         }
167         fail("mismatch");
168       }
169     }
170   }
171 
172   public void testNewString() {
173     final int[] codePoints = {
174         Character.toCodePoint(Character.MIN_HIGH_SURROGATE,
175             Character.MAX_LOW_SURROGATE),
176         Character.toCodePoint(Character.MAX_HIGH_SURROGATE,
177             Character.MIN_LOW_SURROGATE), Character.MAX_HIGH_SURROGATE, 'A',
178         -1,};
179 
180     final String cpString = "" + Character.MIN_HIGH_SURROGATE
181         + Character.MAX_LOW_SURROGATE + Character.MAX_HIGH_SURROGATE
182         + Character.MIN_LOW_SURROGATE + Character.MAX_HIGH_SURROGATE + 'A';
183 
184     final int[][] tests = { {0, 1, 0, 2}, {0, 2, 0, 4}, {1, 1, 2, 2},
185         {1, 2, 2, 3}, {1, 3, 2, 4}, {2, 2, 4, 2}, {2, 3, 0, -1}, {4, 5, 0, -1},
186         {3, -1, 0, -1}};
187 
188     for (int i = 0; i < tests.length; ++i) {
189       int[] t = tests[i];
190       int s = t[0];
191       int c = t[1];
192       int rs = t[2];
193       int rc = t[3];
194 
195       try {
196         String str = UnicodeUtil.newString(codePoints, s, c);
197         assertFalse(rc == -1);
198         assertEquals(cpString.substring(rs, rs + rc), str);
199         continue;
200       } catch (IndexOutOfBoundsException | IllegalArgumentException e1) {
201         // Ignored.
202       }
203       assertTrue(rc == -1);
204     }
205   }
206   
207   public void testUTF8UTF16CharsRef() {
208     int num = atLeast(3989);
209     for (int i = 0; i < num; i++) {
210       String unicode = TestUtil.randomRealisticUnicodeString(random());
211       BytesRef ref = new BytesRef(unicode);
212       CharsRefBuilder cRef = new CharsRefBuilder();
213       cRef.copyUTF8Bytes(ref);
214       assertEquals(cRef.toString(), unicode);
215     }
216   }
217 
218   public void testCalcUTF16toUTF8Length() {
219     int num = atLeast(5000);
220     for (int i = 0; i < num; i++) {
221       String unicode = TestUtil.randomUnicodeString(random());
222       byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
223       int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
224       assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
225     }
226   }
227 }